# Activate the project's virtual environment before running this script.
# This script details how the trained models developed in this thesis can be applied to make predictions in new datasets.
# The script uses the example of the CAPP model, but the script can be easily adapted for any of the models.

# Load imports
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from collections import Counter
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import balanced_accuracy_score, average_precision_score, f1_score
import pickle

# set working directory
# NOTE(review): placeholder path — update to the directory containing the saved model file.
os.chdir("/../../..")

# Load the saved CAPP model, trained on the complete training dataset, oversampled by 300% and undersampled to balance class proportions.
# SECURITY NOTE: pickle can execute arbitrary code on load — only unpickle model files from a trusted source.
filename = 'CAPP.sav'
# Use a context manager so the file handle is closed even if unpickling fails
# (the original open(...) call leaked the handle).
with open(filename, 'rb') as model_file:
	model = pickle.load(model_file)

# Define function to calculate performance metrics
def performance(y_test, y_pred, y_probs):
	"""Print and return classification performance metrics for binary predictions.

	Parameters
	----------
	y_test : array-like
		True binary outcome labels (0 = no asthma, 1 = asthma).
	y_pred : array-like
		Predicted class labels from the model.
	y_probs : array-like
		Predicted probabilities for the positive class (used for ROC AUC / PR AUC).

	Returns
	-------
	dict
		Metric name -> value. (Previously the function returned None and
		discarded every computed value; existing callers that ignore the
		return value are unaffected.)
	"""
	# Confusion matrix layout: rows = true class, columns = predicted class,
	# so cm_test[1,1] = true positives, cm_test[0,0] = true negatives.
	cm_test = confusion_matrix(y_test, y_pred)
	test_report = classification_report(y_test, y_pred)
	accuracy = accuracy_score(y_test, y_pred)
	balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
	sensitivity = cm_test[1, 1] / (cm_test[1, 0] + cm_test[1, 1])
	specificity = cm_test[0, 0] / (cm_test[0, 0] + cm_test[0, 1])
	PPV = cm_test[1, 1] / (cm_test[1, 1] + cm_test[0, 1])
	NPV = cm_test[0, 0] / (cm_test[0, 0] + cm_test[1, 0])
	# Likelihood ratios; if specificity is 1 (or 0) numpy emits a runtime
	# warning and yields inf/nan, matching the original behaviour.
	LRp = sensitivity / (1 - specificity)
	LRn = (1 - sensitivity) / specificity
	F1 = f1_score(y_test, y_pred)
	ROCAUC = roc_auc_score(y_test, y_probs)
	PR_AUC = average_precision_score(y_test, y_probs)
	print(cm_test)
	print(test_report)
	print('accuracy:=%f' % (accuracy))
	print('balanced_accuracy:=%f' % (balanced_accuracy))
	print('Sensitivity:=%f' % (sensitivity))
	print('Specificity:=%f' % (specificity))
	print('PPV:=%f' % (PPV))
	print('NPV:=%f' % (NPV))
	print('LRp:=%f' % (LRp))
	print('LRn:=%f' % (LRn))
	# BUG FIX: F1 was computed but never reported.
	print('F1:=%f' % (F1))
	print('ROCAUC:=%f' % (ROCAUC))
	print('PR_AUC:=%f' % (PR_AUC))
	return {
		'confusion_matrix': cm_test,
		'classification_report': test_report,
		'accuracy': accuracy,
		'balanced_accuracy': balanced_accuracy,
		'sensitivity': sensitivity,
		'specificity': specificity,
		'PPV': PPV,
		'NPV': NPV,
		'LRp': LRp,
		'LRn': LRn,
		'F1': F1,
		'ROCAUC': ROCAUC,
		'PR_AUC': PR_AUC,
	}

# Load new dataset for which you want to make predictions on.
	# The data should be in the format where each individual's data is on a single row, and each column is the data for each predictor in the model (encoded correctly as continuous/categorical).
	# The order of the columns is important. For all models (CAPE/CAPP individual/integrated models), relevant predictors should be in the order of:
	# maternal age, birthweight, age of solid food introduction, BMI at age 1, BMI at preschool age, PRS, newborn/childhood MRS, total breastfeeding duration, early life wheeze, early life cough, preschool wheeze, preschool cough, preschool nocturnal symptoms, preschool atopy, preschool polysensitisation status, maternal socioeconomic status.
	# The final column should contain the asthma outcome, necessary for evaluating the accuracy of the predictions made.
	# In this code, all continuous variables are in the first few columns. In this example, the CAPP model has 5 continuous variables in the first five predictor columns.

new_data = pd.read_csv("/../new_data_to_make_predictions.csv", index_col=False)

# Standardise continuous variables within this dataset
	# In this example, the CAPP model has 5 continuous variables in the first five predictor columns.
# NOTE(review): fit_transform re-estimates mean/SD from this new dataset; ideally the
# scaler fitted on the original training data would be reused here — confirm intent.
scaler = StandardScaler()
cont_predictors = pd.DataFrame(scaler.fit_transform(new_data.iloc[:, 0:5]), columns=('Mat_age', 'Birthweight', 'Solid_food', 'BMI_1', 'BMI_5'))
cat_predictors = new_data.iloc[:, 5:]
new_data_standardised = pd.concat([cont_predictors, cat_predictors.reset_index(drop=True)], axis=1)

# Split data into predictors and outcome
# BUG FIX: the split previously used the raw `new_data`, so the standardised
# dataframe built above was never used and predictions ran on unscaled inputs.
X_test = new_data_standardised.drop(['Asthma'], axis=1)
y_test = new_data_standardised['Asthma']

# Predict asthma status for each individual in the dataset
y_pred = model.predict(X_test)
# predict_proba returns one column per class; keep the positive-class probability.
probs = model.predict_proba(X_test)
y_probs = probs[:, 1]
performance(y_test, y_pred, y_probs)